/*******************************************************************************
 Copyright (c) 2023 Arm Corporation All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

   * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
   * Neither the name of Arm Corporation nor the names of its contributors
     may be used to endorse or promote products derived from this software
     without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

.arch armv8-a+crypto

// All tables below are read-only constants (only ever loaded via ld1/tbl),
// so keep them in .rodata rather than the writable .data section.
.section .rodata

// NOTE(review): on AArch64, ".align n" aligns to 2^n bytes, so ".align 8"
// requests 256-byte alignment — possibly 8 bytes was intended.  Harmless
// for data (only adds padding); confirm the intent.
.align    8
// snow3g_invSR_SQ: 256-entry byte lookup table used by S2_BOX_4_NEON.
// The name suggests the SNOW3G SQ S-box composed with the inverse AES
// ShiftRows permutation, so one table lookup covers both steps — TODO
// confirm against the reference C implementation.
.type    snow3g_invSR_SQ, %object
snow3g_invSR_SQ:
.byte   0xC2, 0xA6, 0x8F, 0x0A, 0x0D, 0xBE, 0xA7, 0x08
.byte   0x1D, 0x99, 0x45, 0x59, 0x13, 0xD2, 0x11, 0x9F
.byte   0xAE, 0xE6, 0xD4, 0xA4, 0x92, 0x8D, 0x58, 0xC1
.byte   0xD0, 0x97, 0xC8, 0x84, 0x9D, 0x4F, 0xBC, 0x3B
.byte   0x2D, 0xEB, 0x27, 0x53, 0x72, 0x4E, 0xE3, 0xEE
.byte   0xDA, 0x7F, 0xAA, 0x4D, 0x5C, 0x2F, 0x44, 0xDB
.byte   0x3E, 0x3A, 0x67, 0xC5, 0xC3, 0x6A, 0x16, 0x4C
.byte   0x38, 0xCC, 0xD7, 0xDD, 0x70, 0x62, 0xF2, 0x19
.byte   0x10, 0x09, 0x98, 0x4B, 0x61, 0xC9, 0x86, 0x03
.byte   0xA8, 0x6B, 0x5A, 0x33, 0x6E, 0x54, 0x5D, 0x8C
.byte   0x41, 0x1A, 0xF7, 0xF6, 0x82, 0xC6, 0xF8, 0x80
.byte   0xC0, 0xC7, 0xFE, 0xB3, 0x65, 0x2C, 0x7B, 0xBA
.byte   0xB4, 0xFC, 0x2A, 0x22, 0x0C, 0x73, 0xF5, 0x5F
.byte   0x64, 0x68, 0x2E, 0x94, 0xB2, 0x24, 0x35, 0x14
.byte   0x78, 0xFB, 0xBF, 0x48, 0xDE, 0xED, 0x43, 0x07
.byte   0xB6, 0x32, 0xE4, 0xBD, 0x74, 0x7D, 0x57, 0x46
.byte   0x3C, 0x37, 0xC4, 0xB7, 0x51, 0x8A, 0xF3, 0x55
.byte   0x6C, 0xCF, 0x79, 0xAB, 0x77, 0xA3, 0xE1, 0x93
.byte   0xD5, 0x6D, 0x81, 0x5B, 0x2B, 0x9A, 0x7E, 0x8B
.byte   0x04, 0xB5, 0x85, 0xD3, 0x91, 0xA1, 0x47, 0x52
.byte   0xA5, 0xEC, 0xD6, 0xBB, 0x20, 0x87, 0x26, 0xF0
.byte   0xAF, 0x4A, 0x89, 0xF4, 0xCE, 0x25, 0xCB, 0x50
.byte   0x00, 0x3F, 0xD9, 0x42, 0x90, 0x21, 0x3D, 0xA9
.byte   0xE7, 0x29, 0x01, 0xF1, 0x36, 0x5E, 0xFA, 0xCD
.byte   0xE5, 0x31, 0x1B, 0x05, 0xFD, 0x9E, 0xA0, 0x76
.byte   0x30, 0xB1, 0x75, 0xB0, 0x9B, 0x56, 0xEA, 0x1C
.byte   0xEF, 0x06, 0x69, 0x7A, 0x95, 0x88, 0x15, 0xFF
.byte   0xCA, 0xAC, 0x0E, 0x23, 0xD8, 0x0F, 0x28, 0x0B
.byte   0x18, 0xF9, 0x63, 0x1E, 0x83, 0x66, 0x39, 0x9C
.byte   0xE2, 0x49, 0x1F, 0xE8, 0xD1, 0x34, 0x7C, 0xA2
.byte   0xB9, 0xE0, 0x02, 0x12, 0xE9, 0xDF, 0xAD, 0x71
.byte   0x96, 0x8E, 0x6F, 0xB8, 0x40, 0x60, 0x17, 0xDC
.size    snow3g_invSR_SQ,.-snow3g_invSR_SQ

.align    8
// snow3g_MULa: eight 16-byte nibble lookup sub-tables used by
// MUL_DIV_A_4_NEON (MUL variant).  The first four sub-tables are indexed by
// the low nibble of the gathered input byte, the last four by the high
// nibble; one byte is fetched from each and the low/high halves are zipped
// into 32-bit values and XORed together.  Presumably implements the SNOW3G
// MULalpha function — confirm against the reference tables.
.type    snow3g_MULa, %object
snow3g_MULa:
.byte   0x00, 0x13, 0x26, 0x35, 0x4C, 0x5F, 0x6A, 0x79
.byte   0x98, 0x8B, 0xBE, 0xAD, 0xD4, 0xC7, 0xF2, 0xE1
.byte   0x00, 0xCF, 0x37, 0xF8, 0x6E, 0xA1, 0x59, 0x96
.byte   0xDC, 0x13, 0xEB, 0x24, 0xB2, 0x7D, 0x85, 0x4A
.byte   0x00, 0x9F, 0x97, 0x08, 0x87, 0x18, 0x10, 0x8F
.byte   0xA7, 0x38, 0x30, 0xAF, 0x20, 0xBF, 0xB7, 0x28
.byte   0x00, 0xE1, 0x6B, 0x8A, 0xD6, 0x37, 0xBD, 0x5C
.byte   0x05, 0xE4, 0x6E, 0x8F, 0xD3, 0x32, 0xB8, 0x59
.byte   0x00, 0x99, 0x9B, 0x02, 0x9F, 0x06, 0x04, 0x9D
.byte   0x97, 0x0E, 0x0C, 0x95, 0x08, 0x91, 0x93, 0x0A
.byte   0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77
.byte   0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF
.byte   0x00, 0xE7, 0x67, 0x80, 0xCE, 0x29, 0xA9, 0x4E
.byte   0x35, 0xD2, 0x52, 0xB5, 0xFB, 0x1C, 0x9C, 0x7B
.byte   0x00, 0x0A, 0x14, 0x1E, 0x28, 0x22, 0x3C, 0x36
.byte   0x50, 0x5A, 0x44, 0x4E, 0x78, 0x72, 0x6C, 0x66
.size    snow3g_MULa,.-snow3g_MULa

.align    8
// snow3g_DIVa: eight 16-byte nibble lookup sub-tables used by
// MUL_DIV_A_4_NEON (DIV variant), laid out identically to snow3g_MULa:
// sub-tables 0-3 indexed by the low nibble, 4-7 by the high nibble.
// Presumably implements the SNOW3G DIValpha function — confirm against
// the reference tables.
.type    snow3g_DIVa, %object
snow3g_DIVa:
.byte   0x00, 0xCD, 0x33, 0xFE, 0x66, 0xAB, 0x55, 0x98
.byte   0xCC, 0x01, 0xFF, 0x32, 0xAA, 0x67, 0x99, 0x54
.byte   0x00, 0x40, 0x80, 0xC0, 0xA9, 0xE9, 0x29, 0x69
.byte   0xFB, 0xBB, 0x7B, 0x3B, 0x52, 0x12, 0xD2, 0x92
.byte   0x00, 0x0F, 0x1E, 0x11, 0x3C, 0x33, 0x22, 0x2D
.byte   0x78, 0x77, 0x66, 0x69, 0x44, 0x4B, 0x5A, 0x55
.byte   0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48
.byte   0xC0, 0xD8, 0xF0, 0xE8, 0xA0, 0xB8, 0x90, 0x88
.byte   0x00, 0x31, 0x62, 0x53, 0xC4, 0xF5, 0xA6, 0x97
.byte   0x21, 0x10, 0x43, 0x72, 0xE5, 0xD4, 0x87, 0xB6
.byte   0x00, 0x5F, 0xBE, 0xE1, 0xD5, 0x8A, 0x6B, 0x34
.byte   0x03, 0x5C, 0xBD, 0xE2, 0xD6, 0x89, 0x68, 0x37
.byte   0x00, 0xF0, 0x49, 0xB9, 0x92, 0x62, 0xDB, 0x2B
.byte   0x8D, 0x7D, 0xC4, 0x34, 0x1F, 0xEF, 0x56, 0xA6
.byte   0x00, 0x29, 0x52, 0x7B, 0xA4, 0x8D, 0xF6, 0xDF
.byte   0xE1, 0xC8, 0xB3, 0x9A, 0x45, 0x6C, 0x17, 0x3E
.size    snow3g_DIVa,.-snow3g_DIVa

.align    6
// n_inv_aes_shift_row: the inverse AES ShiftRows byte permutation.  Applied
// with tbl before aese (see S1_BOX_4_NEON / S2_BOX_4_NEON) so that the
// ShiftRows step inside aese is cancelled, leaving only SubBytes.
.type    n_inv_aes_shift_row, %object
n_inv_aes_shift_row:
.byte    0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
.byte    0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.size    n_inv_aes_shift_row,.-n_inv_aes_shift_row

.align    6
// ror8: tbl index vector that rotates each 32-bit lane right by 8 bits
// (each output byte i takes input byte i+1 within its lane, wrapping).
// Used by S2_MIXC_FIXUP_4_NEON.
.type    ror8, %object
ror8:
.word    0x00030201, 0x04070605, 0x080b0a09, 0x0c0f0e0d
.size    ror8,.-ror8

.align    6
// gather_clear_mask_mul: tbl index vector that gathers byte 3 (bits 31:24)
// of each of the four 32-bit lanes into the low 4 bytes of the result;
// 0xff indices are out of range so tbl zeroes the remaining 12 bytes.
// Used by MUL_DIV_A_4_NEON for the MULa input byte of S0.
.type    gather_clear_mask_mul, %object
gather_clear_mask_mul:
.byte   0x03, 0x07, 0x0b, 0x0f, 0xff, 0xff, 0xff, 0xff
.byte   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.size    gather_clear_mask_mul,.-gather_clear_mask_mul

.align    6
// gather_clear_mask_div: tbl index vector that gathers byte 0 (bits 7:0)
// of each of the four 32-bit lanes into the low 4 bytes of the result,
// zeroing the rest (0xff = out-of-range index).  Used by
// MUL_DIV_A_4_NEON for the DIVa input byte of S11.
.type    gather_clear_mask_div, %object
gather_clear_mask_div:
.byte   0x00, 0x04, 0x08, 0x0c, 0xff, 0xff, 0xff, 0xff
.byte   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.size    gather_clear_mask_div,.-gather_clear_mask_div

.align    6
// iv_swap_mask: tbl index vector that byte-reverses each 32-bit word
// (little-endian <-> big-endian word swap).  Applied to the IVs in
// INTERLEAVE_IV_KEY_4 when SWAP == 1.
.type    iv_swap_mask, %object
iv_swap_mask:
.quad    0x0405060700010203, 0x0c0d0e0f08090a0b
.size    iv_swap_mask,.-iv_swap_mask

.section .text

/*
 * START_FUNC()/END_FUNC() - declare/close a global function symbol.
 *
 * NOTE: on AArch64 the GAS ".align n" directive aligns to 2^n bytes, so the
 * previous ".align 16" requested 64 KiB alignment (presumably a carry-over
 * from x86 syntax, where "align 16" means 16 bytes).  ".p2align 4" states
 * the intended 16-byte function alignment unambiguously.
 */
#define START_FUNC(fn) .globl fn; \
        .type fn, %function; \
        .p2align 4; \
        fn:

#define END_FUNC(fn) .size fn,.-fn

/*
 * declare_register()
 *
 * Bind a symbolic register alias via .req.  A guard symbol (def_\name) is
 * defined alongside the alias; if it already exists, the old binding is
 * dropped with .unreq first, so a name can safely be re-declared.
 *
 * params
 *     \name - alias to define
 *     \reg  - architectural register it maps to
 */
.macro declare_register name:req, reg:req
.ifdef def_\name
    .unreq    \name
.endif
    .set def_\name, 0
    \name    .req    \reg
.endm

// Cipher state: 16 LFSR stages and 3 FSM registers, one 32-bit lane per
// buffer (4 buffers processed in parallel).  LFSR/FSM live in v12-v30,
// of which only the low 64 bits of v8-v15 are callee-saved (AAPCS64);
// FUNC_VECTOR_SAVE handles that.
declare_register LFSR_S0, v12
declare_register LFSR_S1, v13
declare_register LFSR_S2, v14
declare_register LFSR_S3, v15
declare_register LFSR_S4, v16
declare_register LFSR_S5, v17
declare_register LFSR_S6,  v18
declare_register LFSR_S7,  v19
declare_register LFSR_S8,  v20
declare_register LFSR_S9,  v21
declare_register LFSR_S10, v22
declare_register LFSR_S11, v23
declare_register LFSR_S12, v24
declare_register LFSR_S13, v25
declare_register LFSR_S14, v26
declare_register LFSR_S15, v27
declare_register FSM_R1,  v28
declare_register FSM_R2,  v29
declare_register FSM_R3,  v30
// Cached inverse-ShiftRows permutation (loaded once per function).
declare_register vINV_SHIFT_ROW, v31
// Vector scratch registers (caller-saved v0-v11).
declare_register vTMP0, v0
declare_register vTMP1, v1
declare_register vTMP2, v2
declare_register vTMP3, v3
declare_register vTMP4, v4
declare_register vTMP5, v5
declare_register vTMP6, v6
declare_register vTMP7, v7
declare_register vTMP8, v8
declare_register vTMP9, v9
declare_register vTMP10, v10
declare_register vTMP11, v11
// Scalar scratch: xTMP0-xTMP9 are caller-saved (x9-x18); xTMP10-xTMP19
// are callee-saved (x19-x28, spilled by FUNC_SCALAR_SAVE).
// NOTE(review): xTMP9 maps to x18, the AAPCS64 platform register — usable
// on Linux but reserved on Windows/Apple targets; confirm the target OS.
declare_register xTMP0, x9
declare_register xTMP1, x10
declare_register xTMP2, x11
declare_register xTMP3, x12
declare_register xTMP4, x13
declare_register xTMP5, x14
declare_register xTMP6, x15
declare_register xTMP7, x16
declare_register xTMP8, x17
declare_register xTMP9, x18
declare_register xTMP10, x19
declare_register xTMP11, x20
declare_register xTMP12, x21
declare_register xTMP13, x22
declare_register xTMP14, x23
declare_register xTMP15, x24
declare_register xTMP16, x25
declare_register xTMP17, x26
declare_register xTMP18, x27
declare_register xTMP19, x28
// 32-bit views of xTMP12-xTMP19.
declare_register wTMP12, w21
declare_register wTMP13, w22
declare_register wTMP14, w23
declare_register wTMP15, w24
declare_register wTMP16, w25
declare_register wTMP17, w26
declare_register wTMP18, w27
declare_register wTMP19, w28

/*
 * FUNC_SCALAR_SAVE()
 *
 * Spill the callee-saved general registers x19-x28 (used as xTMP10-19).
 * Allocates 80 bytes with pre-indexed stp; sp stays 16-byte aligned.
 */
.macro FUNC_SCALAR_SAVE
    stp x19, x20, [sp, -80]!
    stp x21, x22, [sp, 16]
    stp x23, x24, [sp, 32]
    stp x25, x26, [sp, 48]
    stp x27, x28, [sp, 64]
.endm

/*
 * FUNC_SCALAR_RESTORE()
 *
 * Restore x19-x28 saved by FUNC_SCALAR_SAVE; the final post-indexed ldp
 * releases the 80-byte frame.
 */
.macro FUNC_SCALAR_RESTORE
    ldp x21, x22, [sp, 16]
    ldp x23, x24, [sp, 32]
    ldp x25, x26, [sp, 48]
    ldp x27, x28, [sp, 64]
    ldp x19, x20, [sp], 80
.endm

/*
 * FUNC_VECTOR_SAVE()
 *
 * Spill the callee-saved SIMD registers: AAPCS64 requires preserving only
 * the low 64 bits of v8-v15, hence the d-register forms.  Allocates 64
 * bytes; sp stays 16-byte aligned.
 */
.macro FUNC_VECTOR_SAVE
    stp d8, d9, [sp, -64]!
    stp d10, d11, [sp, 16]
    stp d12, d13, [sp, 32]
    stp d14, d15, [sp, 48]
.endm

/*
 * FUNC_VECTOR_RESTORE()
 *
 * Restore d8-d15 saved by FUNC_VECTOR_SAVE; the final post-indexed ldp
 * releases the 64-byte frame.
 */
.macro FUNC_VECTOR_RESTORE
    ldp d10, d11, [sp, 16]
    ldp d12, d13, [sp, 32]
    ldp d14, d15, [sp, 48]
    ldp d8, d9, [sp], 64
.endm

/*
 * S1_BOX_4_NEON()
 *
 * SNOW3G S1 box applied to 4 lanes at once.  aese with an all-zero round
 * key performs AddRoundKey(0) + ShiftRows + SubBytes; pre-permuting the
 * input with the inverse ShiftRows table cancels the ShiftRows step, so
 * the net effect is SubBytes followed by MixColumns (aesmc) — the S1
 * definition.
 *
 * params
 *     \x - input value
 *     \rslt - return value
 * uses
 *     vTMP0
 */
.macro S1_BOX_4_NEON x, rslt
    // Pre-apply inverse ShiftRows so aese's ShiftRows cancels out.
    tbl \rslt\().16B, {\x\().16B}, vINV_SHIFT_ROW.16B
    movi vTMP0.16B, #0
    aese \rslt\().16B, vTMP0.16B
    aesmc \rslt\().16B, \rslt\().16B
.endm

/*
 * LOOKUP_16X8BIT_NEON()
 *
 * Full 256-entry byte-table lookup of 16 indices, done as 8 windows of 32
 * table bytes.  tbx writes only the lanes whose index is in range [0,31]
 * and leaves the others untouched; after each window the indices are
 * decremented by 32, so every lane is written in exactly one window and
 * \rslt needs no initialization.
 *
 * params
 *     \index  - input value (destroyed: decremented by 32 per window)
 *     \lookup - lookup table pointer (advanced by 224 bytes)
 *     \rslt   - return value
 * uses
 *     vTMP0-2
 */
.macro LOOKUP_16X8BIT_NEON index, lookup, rslt
    movi vTMP0.16B, #32

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()], #32
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
    sub \index\().16B, \index\().16B, vTMP0.16B

    // Final window: no post-increment / decrement needed.
    ld1 {vTMP1.16B, vTMP2.16B}, [\lookup\()]
    tbx \rslt\().16B, {vTMP1.16B, vTMP2.16B}, \index\().16B
.endm

/*
 * S2_MIXC_FIXUP_4_NEON()
 *
 * Corrects the AES MixColumns output so it matches the S2 box's column
 * mix.  A byte mask (0xFF where the pre-MixColumns byte has its top bit
 * set) is XORed with its within-lane byte rotation, selecting the lanes
 * whose field-reduction differs, and the correction constant 0x72 is
 * applied there.  (0x72 is presumably the reduction-polynomial delta
 * between the AES and SNOW3G S2 fields — TODO confirm.)
 *
 * params
 *     \no_mixc  - input value
 *     \mixc     - lookup table
 *     \rslt     - return value
 * uses
 *     xTMP0, vTMP0-1
 */
.macro S2_MIXC_FIXUP_4_NEON no_mixc, mixc, rslt
    // PAT = CMLT(NO_MIXC);  0xFF per byte whose sign bit is set
    cmlt vTMP1.16B, \no_mixc\().16B, #0
    // PAT_SHUF = TBL(PAT, ROR8);  rotate mask bytes within each lane
    adrp xTMP0, ror8
    add xTMP0, xTMP0, #:lo12:ror8
    ld1 {vTMP0.16B}, [xTMP0]
    tbl vTMP0.16B, {vTMP1.16B}, vTMP0.16B
    // RSLT = MIXC ^ (0X72 AND (PAT ^ PAT_SHUF))
    eor vTMP1.16B, vTMP1.16B, vTMP0.16B
    movi vTMP0.16B, #0x72
    and vTMP0.16B, vTMP0.16B, vTMP1.16B
    eor \rslt\().16B, vTMP0.16B, \mixc\().16B
.endm

/*
 * S2_BOX_4_NEON()
 *
 * SNOW3G S2 box for 4 lanes: a 256-entry table lookup (SQ composed with
 * inverse ShiftRows), then SubBytes via aese with a zero key (ShiftRows
 * cancelled by a second inverse-ShiftRows permute), then MixColumns with
 * a fix-up for the S2 field reduction.
 *
 * params
 *     \x    - input value (destroyed by LOOKUP_16X8BIT_NEON)
 *     \rslt - return value
 * uses
 *     xTMP0, vTMP0-3
 */
.macro S2_BOX_4_NEON x, rslt
    // NEW_X = TBL(LOOKUP(X, snow3g_invSR_SQ), inv_aes_shift_row);
    adrp xTMP0, snow3g_invSR_SQ
    add xTMP0, xTMP0, #:lo12:snow3g_invSR_SQ
    LOOKUP_16X8BIT_NEON \x\(), xTMP0, \rslt\()
    // NOMIXC = AESE(NEW_X, 0)
    tbl vTMP3.16B, {\rslt\().16B}, vINV_SHIFT_ROW.16B
    movi vTMP2.16B, #0
    aese vTMP3.16B, vTMP2.16B
    // MIXC = AESMC(NOMIXC)
    aesmc vTMP2.16B, vTMP3.16B
    // S2_MIXC_FIXUP(NOMIXC, MIXC)
    S2_MIXC_FIXUP_4_NEON vTMP3, vTMP2, \rslt\()
.endm

/*
 * MUL_DIV_A_4_NEON()
 *
 * Computes MULa(S0 byte 3) or DIVa(S11 byte 0) for 4 lanes using
 * nibble-split table lookups: the relevant byte of each 32-bit lane is
 * gathered into the low 4 bytes, split into low/high nibbles, each nibble
 * indexes four 16-byte sub-tables, and the gathered bytes are zipped back
 * into 32-bit values TL/TH whose XOR is the result.
 *
 * params
 *     \MUL_OR_DIV - literally MUL or DIV, selects table set
 *     \S    - input value, S0 or S11
 *     \rslt - return value
 * uses
 *     xTMP0, vTMP0-4
 */
.macro MUL_DIV_A_4_NEON MUL_OR_DIV S, rslt
    // L = S0,3  & 0x0F
    // L = S11,3 & 0x0F
    movi vTMP0.16B, #0x0F
.ifc \MUL_OR_DIV, MUL
    adrp xTMP0, gather_clear_mask_mul
    add xTMP0, xTMP0, #:lo12:gather_clear_mask_mul
.else
    adrp xTMP0, gather_clear_mask_div
    add xTMP0, xTMP0, #:lo12:gather_clear_mask_div
.endif
    ld1 {vTMP1.16B}, [xTMP0]

    // TL = TBL8(MUL/DIVa_B0, L) || TBL8(MUL/DIVa_B1, L) ||
    //      TBL8(MUL/DIVa_B2, L) || TBL8(MUL/DIVa_B3, L)
    tbl vTMP1.16B, {\S\().16B}, vTMP1.16B
    and vTMP0.16B, vTMP1.16B, vTMP0.16B
.ifc \MUL_OR_DIV, MUL
    adrp xTMP0, snow3g_MULa
    add xTMP0, xTMP0, #:lo12:snow3g_MULa
.else
    adrp xTMP0, snow3g_DIVa
    add xTMP0, xTMP0, #:lo12:snow3g_DIVa
.endif
    ld1 {vTMP2.16b,vTMP3.16b},[xTMP0],#32
    tbl vTMP2.16B, {vTMP2.16B}, vTMP0.16B
    tbl vTMP3.16B, {vTMP3.16B}, vTMP0.16B
    zip1 vTMP2.16B, vTMP2.16B, vTMP3.16B
    ld1 {vTMP3.16b,vTMP4.16b},[xTMP0],#32
    tbl vTMP3.16B, {vTMP3.16B}, vTMP0.16B
    tbl vTMP4.16B, {vTMP4.16B}, vTMP0.16B
    zip1 vTMP3.16B, vTMP3.16B, vTMP4.16B
    zip1 vTMP0.8H, vTMP2.8H, vTMP3.8H

    // H = S0,3 & 0xF0
    ushr vTMP1.16B, vTMP1.16B, #4

    // TH = TBL8(MUL/DIVa_B4, H) || TBL8(MUL/DIVa_B5, H) ||
    //      TBL8(MUL/DIVa_B6, H) || TBL8(MUL/DIVa_B7, H)
    ld1 {vTMP2.16b,vTMP3.16b},[xTMP0],#32
    tbl vTMP2.16B, {vTMP2.16B}, vTMP1.16B
    tbl vTMP3.16B, {vTMP3.16B}, vTMP1.16B
    zip1 vTMP2.16B, vTMP2.16B, vTMP3.16B
    ld1 {vTMP3.16b,vTMP4.16b},[xTMP0]
    tbl vTMP3.16B, {vTMP3.16B}, vTMP1.16B
    tbl vTMP4.16B, {vTMP4.16B}, vTMP1.16B
    zip1 vTMP3.16B, vTMP3.16B, vTMP4.16B
    zip1 vTMP1.8H, vTMP2.8H, vTMP3.8H

    // RSLT = TL ^ TH
    eor \rslt\().16B, vTMP1.16B, vTMP0.16B
.endm

/*
 * CLOCK_FSM_4_NEON()
 *
 * One FSM clock for 4 lanes: computes the FSM output word F and updates
 * R1/R2/R3.  (Header previously said CLOCK_FSM_8_NEON, which was a typo.)
 *
 * params
 *     \F - output: FSM output word, F = (S15 + R1) ^ R2
 * uses
 *     xTMP0, vTMP0-4
 */
.macro CLOCK_FSM_4_NEON F
    // F = (S15 + R1) ^ R2
    // R = R2 + (R3 ^ S5)
    add \F\().4S, LFSR_S15.4S, FSM_R1.4S
    eor vTMP4.16B, LFSR_S5.16B, FSM_R3.16B
    eor \F\().16B, \F\().16B, FSM_R2.16B
    add vTMP4.4S, vTMP4.4S, FSM_R2.4S
    // R3 = S2(R2);  (S2_BOX destroys the old R2 value it reads)
    S2_BOX_4_NEON FSM_R2, FSM_R3
    // R2 = S1(R1);
    S1_BOX_4_NEON FSM_R1, FSM_R2
    // R1 = R;
    mov FSM_R1.16B, vTMP4.16B
.endm

/*
 * SHIFT_LFSR_4_NEON()
 *
 * Shift the LFSR one stage (Si = Si+1 for i = 0..14) and load the new
 * feedback value into S15.  The moves are interleaved across the four
 * register groups (S0-3, S4-7, S8-11, S12-15), saving the three values
 * that would otherwise be overwritten before they are consumed.
 *
 * params
 *     \S15 - new feedback value to place in LFSR_S15
 * uses
 *     vTMP0-2
 */
.macro SHIFT_LFSR_4_NEON S15
    mov vTMP0.16B, LFSR_S4.16B
    mov vTMP1.16B, LFSR_S8.16B
    mov vTMP2.16B, LFSR_S12.16B

    mov LFSR_S0.16B, LFSR_S1.16B
    mov LFSR_S4.16B, LFSR_S5.16B
    mov LFSR_S8.16B, LFSR_S9.16B
    mov LFSR_S12.16B, LFSR_S13.16B

    mov LFSR_S1.16B, LFSR_S2.16B
    mov LFSR_S5.16B, LFSR_S6.16B
    mov LFSR_S9.16B, LFSR_S10.16B
    mov LFSR_S13.16B, LFSR_S14.16B

    mov LFSR_S2.16B, LFSR_S3.16B
    mov LFSR_S6.16B, LFSR_S7.16B
    mov LFSR_S10.16B, LFSR_S11.16B
    mov LFSR_S14.16B, LFSR_S15.16B

    mov LFSR_S3.16B, vTMP0.16B
    mov LFSR_S7.16B, vTMP1.16B
    mov LFSR_S11.16B, vTMP2.16B
    mov LFSR_S15.16B, \S15\().16B
.endm

/*
 * CLOCK_LFSR_4_NEON()
 *
 * One keystream-mode LFSR clock for 4 lanes: computes the feedback word
 * V = (S0<<8) ^ MULa(S0) ^ S2 ^ (S11>>8) ^ DIVa(S11) and shifts it in.
 *
 * uses
 *     xTMP0, vTMP0-6
 */
.macro CLOCK_LFSR_4_NEON
    // V = (S0<<8) ^ MULa(S0) ^ S2 ^ (S11>>8) ^ DIVa(S11)
    MUL_DIV_A_4_NEON MUL LFSR_S0, vTMP5
    MUL_DIV_A_4_NEON DIV LFSR_S11, vTMP6
    shl vTMP0.4S, LFSR_S0.4S, #8
    ushr vTMP1.4S, LFSR_S11.4S, #8
    eor vTMP5.16B, vTMP5.16B, vTMP6.16B
    eor vTMP0.16B, vTMP0.16B, vTMP1.16B
    eor vTMP5.16B, vTMP5.16B, vTMP0.16B
    eor vTMP3.16B, vTMP5.16B, LFSR_S2.16B
    SHIFT_LFSR_4_NEON vTMP3
.endm

/*
 * SNOW3G_KEYSTREAM_4_4_NEON()
 *
 * Produce one 32-bit keystream word per lane (4 lanes): KEY = F ^ S0,
 * then clock the LFSR in keystream mode.
 *
 * params
 *     \KEY - output keystream
 * uses
 *     xTMP0, vTMP0-6
 */
.macro SNOW3G_KEYSTREAM_4_4_NEON KEY
    CLOCK_FSM_4_NEON \KEY\()
    eor \KEY\().16B, \KEY\().16B, LFSR_S0.16B
    CLOCK_LFSR_4_NEON
.endm

/*
 * INTERLEAVE_IV_KEY_4()
 *
 * Load one 16-byte key/IV per buffer and transpose the 4x4 matrix of
 * 32-bit words so that lane i of each RSLTn holds buffer i's word n.
 * When SWAP == 1 (used for IVs) every 32-bit word is byte-reversed first
 * via iv_swap_mask.
 *
 * params
 *     \SWAP        - 1 to byte-swap 32-bit words before transposing
 *     \RSLT0-3     - output vectors (word n of all 4 buffers)
 *     \ADDR1-4     - source addresses, one per buffer
 * uses
 *     vTMP0-3 when SWAP == 0
 *     xTMP0, vTMP0-4 when SWAP == 1
 */
.macro INTERLEAVE_IV_KEY_4 SWAP RSLT0, RSLT1, RSLT2, RSLT3, \
                                ADDR1, ADDR2, ADDR3, ADDR4
    ld1 {\RSLT0\().4S}, [\ADDR1\()]
    ld1 {\RSLT1\().4S}, [\ADDR2\()]
    ld1 {\RSLT2\().4S}, [\ADDR3\()]
    ld1 {\RSLT3\().4S}, [\ADDR4\()]
.if \SWAP == 1
    adrp xTMP0, iv_swap_mask
    add xTMP0, xTMP0, #:lo12:iv_swap_mask
    ld1 {vTMP4.4S}, [xTMP0]
    tbl \RSLT0\().16B, {\RSLT0\().16B}, vTMP4.16B
    tbl \RSLT1\().16B, {\RSLT1\().16B}, vTMP4.16B
    tbl \RSLT2\().16B, {\RSLT2\().16B}, vTMP4.16B
    tbl \RSLT3\().16B, {\RSLT3\().16B}, vTMP4.16B
.endif
    // 4x4 32-bit transpose: zip words, then zip doublewords.
    zip1 vTMP0.4S, \RSLT0\().4S, \RSLT1\().4S
    zip2 vTMP1.4S, \RSLT0\().4S, \RSLT1\().4S
    zip1 vTMP2.4S, \RSLT2\().4S, \RSLT3\().4S
    zip2 vTMP3.4S, \RSLT2\().4S, \RSLT3\().4S

    zip1 \RSLT0\().2D, vTMP0.2D, vTMP2.2D
    zip2 \RSLT1\().2D, vTMP0.2D, vTMP2.2D
    zip1 \RSLT2\().2D, vTMP1.2D, vTMP3.2D
    zip2 \RSLT3\().2D, vTMP1.2D, vTMP3.2D
.endm

/*
 * SNOW3G_INITIALIZE_4_NEON_FIRST()
 *
 * Load the LFSR initial state for 4 buffers: key words into S4-S7 and
 * S12-S15, complemented key words into S0-S3 and S8-S11, then XOR the
 * (byte-swapped) IV words into S9, S10, S12 and S15.  FSM registers are
 * zeroed.
 *
 * params
 *     \KEYADDR1-4 - key schedule addresses, one per buffer
 *     \IVADDR1-4  - IV addresses, one per buffer
 * uses
 *     xTMP0, vTMP0-8   (xTMP0 via INTERLEAVE_IV_KEY_4 with SWAP == 1)
 */
.macro SNOW3G_INITIALIZE_4_NEON_FIRST KEYADDR1 KEYADDR2 KEYADDR3 KEYADDR4 \
                                      IVADDR1 IVADDR2 IVADDR3 IVADDR4

    INTERLEAVE_IV_KEY_4 0, vTMP4, vTMP5, vTMP6, vTMP7, \
                        \KEYADDR1\(), \KEYADDR2\(), \KEYADDR3\(), \KEYADDR4\()
    mov LFSR_S4.16B, vTMP4.16B
    mov LFSR_S5.16B, vTMP5.16B
    mov LFSR_S6.16B, vTMP6.16B
    mov LFSR_S7.16B, vTMP7.16B
    mov LFSR_S12.16B, vTMP4.16B
    mov LFSR_S13.16B, vTMP5.16B
    mov LFSR_S14.16B, vTMP6.16B
    mov LFSR_S15.16B, vTMP7.16B
    not LFSR_S0.16B, vTMP4.16B
    not LFSR_S1.16B, vTMP5.16B
    not LFSR_S2.16B, vTMP6.16B
    not LFSR_S3.16B, vTMP7.16B
    mov LFSR_S8.16B, LFSR_S0.16B
    mov LFSR_S9.16B, LFSR_S1.16B
    mov LFSR_S10.16B, LFSR_S2.16B
    mov LFSR_S11.16B, LFSR_S3.16B

    INTERLEAVE_IV_KEY_4 1, vTMP5, vTMP6, vTMP7, vTMP8, \
                        \IVADDR1\(), \IVADDR2\(), \IVADDR3\(), \IVADDR4\()

    eor LFSR_S15.16B, LFSR_S15.16B, vTMP8.16B
    eor LFSR_S12.16B, LFSR_S12.16B, vTMP7.16B
    eor LFSR_S10.16B, LFSR_S10.16B, vTMP6.16B
    eor LFSR_S9.16B, LFSR_S9.16B, vTMP5.16B

    movi FSM_R1.16B, #0
    movi FSM_R2.16B, #0
    movi FSM_R3.16B, #0
.endm

/*
 * SNOW3G_INITIALIZE_4_NEON_SECOND()
 *
 * Initialization-mode clocking: 32 clocks with the FSM output F XORed
 * into the new S15 (feedback mode), followed by one extra clock whose
 * FSM output is discarded — matching the spec's transition to keystream
 * mode where the first output word is not used.
 *
 * uses
 *     xTMP0, vTMP0-7
 */
.macro SNOW3G_INITIALIZE_4_NEON_SECOND
.rept 32
    CLOCK_FSM_4_NEON vTMP7
    CLOCK_LFSR_4_NEON
    eor LFSR_S15.16B, LFSR_S15.16B, vTMP7.16B
.endr
    CLOCK_FSM_4_NEON vTMP7
    CLOCK_LFSR_4_NEON
.endm

/*
 * SNOW3G_LOAD_CTX_4_NEON()
 *
 * Load the interleaved 4-buffer state from memory: 16 LFSR vectors
 * (256 bytes) followed by FSM R1-R3 (48 bytes).  \ctx_addr is restored
 * to its original value afterwards.
 */
.macro SNOW3G_LOAD_CTX_4_NEON ctx_addr
    ld1 {LFSR_S0.16B, LFSR_S1.16B, LFSR_S2.16B, LFSR_S3.16B}, [\ctx_addr\()], #64
    ld1 {LFSR_S4.16B, LFSR_S5.16B, LFSR_S6.16B, LFSR_S7.16B}, [\ctx_addr\()], #64
    ld1 {LFSR_S8.16B, LFSR_S9.16B, LFSR_S10.16B, LFSR_S11.16B}, [\ctx_addr\()], #64
    ld1 {LFSR_S12.16B, LFSR_S13.16B, LFSR_S14.16B, LFSR_S15.16B}, [\ctx_addr\()], #64
    ld1 {FSM_R1.16B, FSM_R2.16B, FSM_R3.16B}, [\ctx_addr\()]
    sub \ctx_addr\(), \ctx_addr\(), #256
.endm

/*
 * SNOW3G_STORE_CTX_4_NEON()
 *
 * Store the interleaved 4-buffer state: 16 LFSR vectors (256 bytes),
 * FSM R1-R3 (48 bytes), then a zero word at offset 304 (presumably a
 * flag/remainder field of the context struct — confirm against the C
 * definition).  \ctx_addr is restored to its original value.
 */
.macro SNOW3G_STORE_CTX_4_NEON ctx_addr
    st1 {LFSR_S0.16B, LFSR_S1.16B, LFSR_S2.16B, LFSR_S3.16B}, [\ctx_addr\()], #64
    st1 {LFSR_S4.16B, LFSR_S5.16B, LFSR_S6.16B, LFSR_S7.16B}, [\ctx_addr\()], #64
    st1 {LFSR_S8.16B, LFSR_S9.16B, LFSR_S10.16B, LFSR_S11.16B}, [\ctx_addr\()], #64
    st1 {LFSR_S12.16B, LFSR_S13.16B, LFSR_S14.16B, LFSR_S15.16B}, [\ctx_addr\()], #64
    st1 {FSM_R1.16B, FSM_R2.16B, FSM_R3.16B}, [\ctx_addr\()], #48
    str wzr, [\ctx_addr\()]
    sub \ctx_addr\(), \ctx_addr\(), #304
.endm

/*
 * CLEAR_VECTORS_NEON()
 *
 * Scrub every vector register that held cipher state or scratch data so
 * no key material is left behind.  movi with a zero immediate leaves each
 * register all-zero, same as the self-EOR idiom, and sets no flags.
 */
.macro CLEAR_VECTORS_NEON
    movi vTMP0.16B, #0
    movi vTMP1.16B, #0
    movi vTMP2.16B, #0
    movi vTMP3.16B, #0
    movi vTMP4.16B, #0
    movi vTMP5.16B, #0
    movi vTMP6.16B, #0
    movi vTMP7.16B, #0
    movi vTMP8.16B, #0
    movi vTMP9.16B, #0
    movi vTMP10.16B, #0
    movi vTMP11.16B, #0
    movi LFSR_S0.16B, #0
    movi LFSR_S1.16B, #0
    movi LFSR_S2.16B, #0
    movi LFSR_S3.16B, #0
    movi LFSR_S4.16B, #0
    movi LFSR_S5.16B, #0
    movi LFSR_S6.16B, #0
    movi LFSR_S7.16B, #0
    movi LFSR_S8.16B, #0
    movi LFSR_S9.16B, #0
    movi LFSR_S10.16B, #0
    movi LFSR_S11.16B, #0
    movi LFSR_S12.16B, #0
    movi LFSR_S13.16B, #0
    movi LFSR_S14.16B, #0
    movi LFSR_S15.16B, #0
    movi FSM_R1.16B, #0
    movi FSM_R2.16B, #0
    movi FSM_R3.16B, #0
.endm

/*
 * X_BYTE_STREAM()
 *
 * XOR up to 15 tail bytes of \IN with the keystream in \KEY and write to
 * \OUT, in 8/4/2/1-byte steps.  After each consumed step the remaining
 * keystream is shifted down one lane (D[0]=D[1], S[0]=S[1], ...), so the
 * next step always reads from element 0.  \IN/\OUT are advanced and \LEN
 * is decremented; \KEY is consumed.
 */
.macro X_BYTE_STREAM IN, OUT, KEY, LEN
    cmp \LEN\(), #8
    b.lt 4f
    ld1 {vTMP0.D}[0], [\IN\()], #8
    eor vTMP0.16B, vTMP0.16B, \KEY\().16B
    st1 {vTMP0.D}[0], [\OUT\()], #8
    mov \KEY\().D[0], \KEY\().D[1]       // shift remaining key down
    sub \LEN\(), \LEN\(), #8
4:
    cmp \LEN\(), #4
    b.lt 2f
    ld1 {vTMP0.S}[0], [\IN\()], #4
    eor vTMP0.16B, vTMP0.16B, \KEY\().16B
    st1 {vTMP0.S}[0], [\OUT\()], #4
    mov \KEY\().S[0], \KEY\().S[1]
    sub \LEN\(), \LEN\(), #4
2:
    cmp \LEN\(), #2
    b.lt 1f
    ld1 {vTMP0.H}[0], [\IN\()], #2
    eor vTMP0.16B, vTMP0.16B, \KEY\().16B
    st1 {vTMP0.H}[0], [\OUT\()], #2
    mov \KEY\().H[0], \KEY\().H[1]
    sub \LEN\(), \LEN\(), #2
1:
    cmp \LEN\(), #1
    b.lt 0f
    ld1 {vTMP0.B}[0], [\IN\()], #1
    eor vTMP0.16B, vTMP0.16B, \KEY\().16B
    st1 {vTMP0.B}[0], [\OUT\()], #1
0:
.endm

/*
 * snow3g_f8_4_buffer_initialize_aarch64_neon_asm(
 *              void *ctx,
 *              snow3g_key_schedule_t **pKeySched,
 *              void **pIV)
 *
 * Initialize 4 interleaved SNOW3G states: x1 points to an array of 4 key
 * schedule pointers, x2 to an array of 4 IV pointers.  Runs the full
 * initialization clocking and stores the resulting state to ctx (x0).
 */
START_FUNC(snow3g_f8_4_buffer_initialize_aarch64_neon_asm)
    FUNC_SCALAR_SAVE
    FUNC_VECTOR_SAVE
    // Cache the inverse-ShiftRows permutation used by all S-box macros.
    adrp xTMP0, n_inv_aes_shift_row
    add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row
    ld1 {vINV_SHIFT_ROW.16B}, [xTMP0]
    // NOTE(review): xTMP9 is x18 (AAPCS64 platform register) — fine on
    // Linux, reserved on Windows/Apple; confirm the supported targets.
    mov xTMP8, x1
    mov xTMP9, x2

    // xTMP0-3 = key schedule pointers, xTMP4-7 = IV pointers.
    ldp xTMP0, xTMP1, [xTMP8], #16
    ldp xTMP2, xTMP3, [xTMP8]
    ldp xTMP4, xTMP5, [xTMP9], #16
    ldp xTMP6, xTMP7, [xTMP9]

    SNOW3G_INITIALIZE_4_NEON_FIRST xTMP0 xTMP1 xTMP2 xTMP3 xTMP4 xTMP5 xTMP6 xTMP7
    SNOW3G_INITIALIZE_4_NEON_SECOND
    SNOW3G_STORE_CTX_4_NEON x0

    FUNC_VECTOR_RESTORE
    FUNC_SCALAR_RESTORE
    ret
END_FUNC(snow3g_f8_4_buffer_initialize_aarch64_neon_asm)

/*
 * snow3g_f8_4_buffer_stream_aarch64_neon_asm(void *ctx,
 *                                            void **in,
 *                                            void **out,
 *                                            uint32_t lengthInBytes)
 *
 * Generate keystream for 4 buffers in parallel and XOR it into the input
 * streams, processing 16, then 8, then 4 bytes per buffer per step (any
 * remainder below 4 bytes is not consumed).  The advanced in/out pointers
 * are written back to the caller's arrays and the state is saved to ctx.
 */
START_FUNC(snow3g_f8_4_buffer_stream_aarch64_neon_asm)
    FUNC_SCALAR_SAVE
    FUNC_VECTOR_SAVE
    // Cache the inverse-ShiftRows permutation used by all S-box macros.
    adrp xTMP0, n_inv_aes_shift_row
    add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row
    ld1 {vINV_SHIFT_ROW.16B}, [xTMP0]

    mov xTMP9, x1
    mov xTMP10, x2
    // lengthInBytes is a uint32_t: AAPCS64 leaves the upper 32 bits of x3
    // unspecified, so zero-extend before the 64-bit compares below
    // (a plain "mov xTMP11, x3" would copy possible garbage high bits).
    and xTMP11, x3, #0xffffffff

    SNOW3G_LOAD_CTX_4_NEON x0
    // xTMP1-4 = per-buffer input pointers, xTMP5-8 = output pointers.
    ldp xTMP1, xTMP2, [xTMP9], #16
    ldp xTMP3, xTMP4, [xTMP9]
    ldp xTMP5, xTMP6, [xTMP10], #16
    ldp xTMP7, xTMP8, [xTMP10]

    cmp xTMP11, #16
    b.lt GEN8

GEN16_LOOP:
    // 4 keystream words per buffer; transpose the 4x4 word matrix so each
    // vector holds 16 contiguous keystream bytes for one buffer, then
    // rev32 converts each word to big-endian byte order.
    SNOW3G_KEYSTREAM_4_4_NEON vTMP8
    SNOW3G_KEYSTREAM_4_4_NEON vTMP9
    SNOW3G_KEYSTREAM_4_4_NEON vTMP10
    SNOW3G_KEYSTREAM_4_4_NEON vTMP11
    zip1 vTMP0.4S, vTMP8.4S, vTMP9.4S
    zip2 vTMP1.4S, vTMP8.4S, vTMP9.4S
    zip1 vTMP2.4S, vTMP10.4S, vTMP11.4S
    zip2 vTMP3.4S, vTMP10.4S, vTMP11.4S
    zip1 vTMP8.2D, vTMP0.2D, vTMP2.2D
    zip2 vTMP9.2D, vTMP0.2D, vTMP2.2D
    zip1 vTMP10.2D, vTMP1.2D, vTMP3.2D
    zip2 vTMP11.2D, vTMP1.2D, vTMP3.2D
    rev32 vTMP8.16B, vTMP8.16B
    rev32 vTMP9.16B, vTMP9.16B
    rev32 vTMP10.16B, vTMP10.16B
    rev32 vTMP11.16B, vTMP11.16B

    ld1 {vTMP0.4S}, [xTMP1], #16
    ld1 {vTMP1.4S}, [xTMP2], #16
    ld1 {vTMP2.4S}, [xTMP3], #16
    ld1 {vTMP3.4S}, [xTMP4], #16
    eor vTMP0.16B, vTMP0.16B, vTMP8.16B
    eor vTMP1.16B, vTMP1.16B, vTMP9.16B
    eor vTMP2.16B, vTMP2.16B, vTMP10.16B
    eor vTMP3.16B, vTMP3.16B, vTMP11.16B
    st1 {vTMP0.4S}, [xTMP5], #16
    st1 {vTMP1.4S}, [xTMP6], #16
    st1 {vTMP2.4S}, [xTMP7], #16
    st1 {vTMP3.4S}, [xTMP8], #16

    sub xTMP11, xTMP11, #16
    cmp xTMP11, #16
    b.ge GEN16_LOOP

GEN8:
    // 8-byte step: 2 keystream words per buffer.
    cmp xTMP11, #8
    b.lt GEN4
    SNOW3G_KEYSTREAM_4_4_NEON vTMP8
    SNOW3G_KEYSTREAM_4_4_NEON vTMP9
    zip1 vTMP10.4S, vTMP8.4S, vTMP9.4S
    zip2 vTMP11.4S, vTMP8.4S, vTMP9.4S
    rev32 vTMP10.16B, vTMP10.16B
    rev32 vTMP11.16B, vTMP11.16B

    ld1 {vTMP0.D}[0], [xTMP1], #8
    ld1 {vTMP0.D}[1], [xTMP2], #8
    ld1 {vTMP1.D}[0], [xTMP3], #8
    ld1 {vTMP1.D}[1], [xTMP4], #8
    eor vTMP0.16B, vTMP0.16B, vTMP10.16B
    eor vTMP1.16B, vTMP1.16B, vTMP11.16B
    st1 {vTMP0.D}[0], [xTMP5], #8
    st1 {vTMP0.D}[1], [xTMP6], #8
    st1 {vTMP1.D}[0], [xTMP7], #8
    st1 {vTMP1.D}[1], [xTMP8], #8

    sub xTMP11, xTMP11, #8

GEN4:
    // 4-byte step: one keystream word per buffer.
    cmp xTMP11, #4
    b.lt FINISH
    SNOW3G_KEYSTREAM_4_4_NEON vTMP8
    rev32 vTMP8.16B, vTMP8.16B

    ld1 {vTMP0.S}[0], [xTMP1], #4
    ld1 {vTMP0.S}[1], [xTMP2], #4
    ld1 {vTMP0.S}[2], [xTMP3], #4
    ld1 {vTMP0.S}[3], [xTMP4], #4
    eor vTMP0.16B, vTMP0.16B, vTMP8.16B
    st1 {vTMP0.S}[0], [xTMP5], #4
    st1 {vTMP0.S}[1], [xTMP6], #4
    st1 {vTMP0.S}[2], [xTMP7], #4
    st1 {vTMP0.S}[3], [xTMP8], #4

FINISH:
    // Save the state and write the advanced pointers back to the caller.
    SNOW3G_STORE_CTX_4_NEON x0
    mov xTMP9, x1
    mov xTMP10, x2
    stp xTMP1, xTMP2, [xTMP9], #16
    stp xTMP3, xTMP4, [xTMP9]
    stp xTMP5, xTMP6, [xTMP10], #16
    stp xTMP7, xTMP8, [xTMP10]

    FUNC_VECTOR_RESTORE
    FUNC_SCALAR_RESTORE
    ret
END_FUNC(snow3g_f8_4_buffer_stream_aarch64_neon_asm)

/*
 * snow3g_f9_4_buffer_keystream_aarch64_neon_asm(void *pCtx,
 *                                               uint32_t* ks)
 *
 * Produce 5 keystream words for each of the 4 buffers.  The output
 * buffers are laid out contiguously at ks, ks+20, ks+40 and ks+60 bytes
 * (5 words each).  The context is loaded from pCtx but NOT written back
 * — the clocked state is discarded.
 */
START_FUNC(snow3g_f9_4_buffer_keystream_aarch64_neon_asm)
    FUNC_SCALAR_SAVE
    FUNC_VECTOR_SAVE
    // Cache the inverse-ShiftRows permutation used by all S-box macros.
    adrp xTMP0, n_inv_aes_shift_row
    add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row
    ld1 {vINV_SHIFT_ROW.16B}, [xTMP0]

    // Per-buffer output pointers, 20 bytes (5 words) apart.
    mov xTMP5, x1
    add xTMP6, x1, #20
    add xTMP7, x1, #40
    add xTMP8, x1, #60

    SNOW3G_LOAD_CTX_4_NEON x0

    // 4 keystream words per buffer, then a 4x4 word transpose so each
    // vector holds one buffer's words 0-3.
    SNOW3G_KEYSTREAM_4_4_NEON vTMP8
    SNOW3G_KEYSTREAM_4_4_NEON vTMP9
    SNOW3G_KEYSTREAM_4_4_NEON vTMP10
    SNOW3G_KEYSTREAM_4_4_NEON vTMP11
    zip1 vTMP0.4S, vTMP8.4S, vTMP9.4S
    zip2 vTMP1.4S, vTMP8.4S, vTMP9.4S
    zip1 vTMP2.4S, vTMP10.4S, vTMP11.4S
    zip2 vTMP3.4S, vTMP10.4S, vTMP11.4S
    zip1 vTMP8.2D, vTMP0.2D, vTMP2.2D
    zip2 vTMP9.2D, vTMP0.2D, vTMP2.2D
    zip1 vTMP10.2D, vTMP1.2D, vTMP3.2D
    zip2 vTMP11.2D, vTMP1.2D, vTMP3.2D

    st1 {vTMP8.4S}, [xTMP5], #16
    st1 {vTMP9.4S}, [xTMP6], #16
    st1 {vTMP10.4S}, [xTMP7], #16
    st1 {vTMP11.4S}, [xTMP8], #16

    // Fifth keystream word: one lane per buffer.
    SNOW3G_KEYSTREAM_4_4_NEON vTMP8

    st1 {vTMP8.S}[0], [xTMP5]
    st1 {vTMP8.S}[1], [xTMP6]
    st1 {vTMP8.S}[2], [xTMP7]
    st1 {vTMP8.S}[3], [xTMP8]

    FUNC_VECTOR_RESTORE
    FUNC_SCALAR_RESTORE
    ret
END_FUNC(snow3g_f9_4_buffer_keystream_aarch64_neon_asm)